Flat Rent Price Prediction in Berlin with Web Scraping: Towards the Digital Transformation in Official Statistics¶

C.5 Ordinary Least Squares¶

In [1]:
# Import required libraries

library(miceadds)
library(miceafter)
library(matlib)
library(lmtest)
library("sandwich")
library(lattice)
library(ggplot2)
library(modelsummary)

options(repr.plot.width=10, repr.plot.height=8)
Lade nötiges Paket: mice


Attache Paket: ‘mice’


Das folgende Objekt ist maskiert ‘package:stats’:

    filter


Die folgenden Objekte sind maskiert von ‘package:base’:

    cbind, rbind


* miceadds 3.13-12 (2022-05-30 15:14:07)

Lade nötiges Paket: zoo


Attache Paket: ‘zoo’


Die folgenden Objekte sind maskiert von ‘package:base’:

    as.Date, as.Date.numeric


In [2]:
# Set work directory

setwd("/Users/milomeyberg/Documents/Immonet_Oktober2/immonet")

# Load mids object

load("mice_imp_ohne_flux_final/mice_imp_ohne_flux_final.Rdata")
In [3]:
# Generate list of completed data.frames
dfs<- complete(get("mi.res"), action='long', include=TRUE)

# Add squared and cubic term of the living area
dfs$WOHNFLAECHE_SQUARED <- dfs$WOHNFLAECHE^2
dfs$WOHNFLAECHE_CUBIC <- dfs$WOHNFLAECHE^3

# Utility Costs 

dfs$NEBENKOSTEN_SQUARED <- dfs$NEBENKOSTEN_M2^2

# Scale the year of construction back

dfs$BAUJAHR_NOT_ROUNDED <- ((dfs$BAUJAHR*1000)-2000)
dfs$BAUJAHR_SQUARED_NOT_ROUNDED <- ((dfs$BAUJAHR^2)-(2000^2))
dfs$BAUJAHR <- (round(dfs$BAUJAHR*1000)-2000)
dfs$BAUJAHR_SQUARED <- ((dfs$BAUJAHR^2))



#Turn back to mids   
dfs <- as.mids(dfs)
In [4]:
# Formatting outputs for model summary table

glance_custom.mipo <- function(x, ...) {
  ret <- as.data.frame(t(apply((x$glanced[c(1:2,5,8:9)]),MARGIN=2,FUN=mean)))
  ret
}

1. Using Quadratic Relationship for Year of Construction¶

In [5]:
# Generate models for each imputed dataset

ra0 <- with(dfs, expr = lm(KALTMIETE_M2 ~ NEBENKOSTEN_M2 + NEBENKOSTEN_SQUARED + HEIZUNGSKOSTEN_M2 + ZIMMER +
                             + WOHNFLAECHE + WOHNFLAECHE_SQUARED + WOHNFLAECHE_CUBIC + BAUJAHR + BAUJAHR_SQUARED + Zentralheizung 
                           + Gas + Fernwärme))


ra1 <- with(dfs, expr = lm(KALTMIETE_M2 ~ NEBENKOSTEN_M2 + NEBENKOSTEN_SQUARED + HEIZUNGSKOSTEN_M2
                           + WOHNFLAECHE + WOHNFLAECHE_SQUARED  + WOHNFLAECHE_CUBIC + BAUJAHR + BAUJAHR_SQUARED + Zentralheizung 
                           + Gas))

ra <- with(dfs, expr = lm(KALTMIETE_M2 ~ NEBENKOSTEN_M2 + NEBENKOSTEN_SQUARED 
                          + WOHNFLAECHE + WOHNFLAECHE_SQUARED + WOHNFLAECHE_CUBIC  + BAUJAHR + BAUJAHR_SQUARED + Zentralheizung 
                          + Gas))

ra2 <- with(dfs, expr = lm(KALTMIETE ~ NEBENKOSTEN + NEBENKOSTEN_SQUARED 
                           + WOHNFLAECHE + WOHNFLAECHE_SQUARED  + WOHNFLAECHE_CUBIC + Neubau + Zentralheizung 
                           + Gas))


# Build a list of models
models <- list( pool(ra0),pool(ra1),pool(ra))


# Generates model summary table
modelsummary(models, fmt = "%.5f",estimate = "{estimate}{stars}",  coef_rename = c("NEBENKOSTEN_M2" = "Utility Costs", 
                                                                                   "NEBENKOSTEN_SQUARED"="Utility Costs Squared",
                                                                                   "HEIZUNGSKOSTEN_M2" = "Heating Costs",
                                                                                   "ZIMMER" = "Rooms Number","WOHNFLAECHE" = "Living Area",
                                                                                   "WOHNFLAECHE_SQUARED" = "Living Area Squared",
                                                                                   "WOHNFLAECHE_CUBIC" = "Cubic Living Area",
                                                                                   "BAUJAHR" = "Construction Year",
                                                                                   "BAUJAHR_SQUARED" = "Construcion Year Squared",
                                                                                   "Zentralheizung1" = "Central Heating",
                                                                                   "Gas1" = "Gas", "Fernwärme1"= "District Heating"),output = 'jupyter')
Model 1 Model 2 Model 3
(Intercept) 19.40555*** 19.08611*** 19.12490***
(1.42538) (1.36730) (1.17332)
Utility Costs −0.73736+ −0.71533+ −0.71293+
(0.40889) (0.40609) (0.41462)
Utility Costs Squared 0.29515*** 0.29225*** 0.29236***
(0.06294) (0.06246) (0.06179)
Heating Costs 0.01158 0.02602
(0.24903) (0.24929)
Rooms Number −0.02126
(0.17710)
Living Area −0.20274*** −0.20408*** −0.20430***
(0.03713) (0.03534) (0.03456)
Living Area Squared 0.00225*** 0.00226*** 0.00226***
(0.00039) (0.00038) (0.00038)
Cubic Living Area −0.00001*** −0.00001*** −0.00001***
(0.00000) (0.00000) (0.00000)
Construction Year 0.13549*** 0.13456*** 0.13456***
(0.00580) (0.00586) (0.00588)
Construcion Year Squared 0.00146*** 0.00145*** 0.00145***
(0.00006) (0.00006) (0.00006)
Central Heating −1.35902* −1.35287* −1.34914*
(0.41294) (0.41826) (0.41524)
Gas 0.25570 0.48428* 0.48573*
(0.32373) (0.22319) (0.21819)
District Heating −0.32036
(0.29879)
Num.Obs. 2950 2950 2950
Num.Imp. 5 5 5
R2 0.320 0.320 0.319
R2 Adj. 0.317 0.317 0.317
AIC 17503.3 17501.3 17501.2
BIC 17587.2 17573.2 17567.0
In [6]:
# Print the p-value of the overall models

print(paste("Model 1: ",glance_custom.mipo(models[[1]])$p.value))
print(paste("Model 2: ",glance_custom.mipo(models[[2]])$p.value))
print(paste("Model 3: ",glance_custom.mipo(models[[3]])$p.value))
[1] "Model 1:  2.20333948809292e-229"
[1] "Model 2:  1.74622816600423e-231"
[1] "Model 3:  2.17637295292107e-232"
In [7]:
# Pooling of the selected model

poolm <- pool(ra)

# -------- Deriving the full variance covariance matrix ------------------ 
# -------(Taken from a Thread of https://stackoverflow.com)--------------- 

# m different imputations

m <- poolm$m

# Vw, the within imputation covariance matrix

vw <- Reduce("+", lapply(ra$analyses, vcov)) / (m)

# Vb, the between imputation covariance matrix
# mice only provides the diagonal elements

bdiag <- poolm$pooled$b

# Calculate the full Vb matrix by hand

# Qbar, pooled parameter estimates

qbar <- getqbar(poolm)

# Qhats, each imputations parameter estimates

qhats <- sapply(ra$analyses, coef)
vb <- (1 / (m-1)) * (qhats - qbar) %*% t(qhats - qbar)

vt <- vw + (1 + 1 / (m)) * vb  # this is t as it used to be


# Checking against the diagonal of t that is still provided

all.equal(as.numeric(diag(vt)), poolm$pooled$t)


# Convert covariance matrix to correlation matrix 

corr <- solve(diag(sqrt(diag(vt)))) %*% vt %*% solve(diag(sqrt(diag(vt))))

corr_2 <- cov2cor(vt)
TRUE
In [8]:
# Corelation matrix of betha
corr_2
A matrix: 10 × 10 of type dbl
(Intercept)NEBENKOSTEN_M2NEBENKOSTEN_SQUAREDWOHNFLAECHEWOHNFLAECHE_SQUAREDWOHNFLAECHE_CUBICBAUJAHRBAUJAHR_SQUAREDZentralheizung1Gas1
(Intercept) 1.00000000-0.54908070 0.53507033-0.83604381 0.768735331-0.710737552 0.16217464 0.085159263 0.0866590-0.025662787
NEBENKOSTEN_M2-0.54908070 1.00000000-0.96781602 0.12574763-0.096174855 0.088883890-0.33365539-0.190617526-0.4081307-0.187689625
NEBENKOSTEN_SQUARED 0.53507033-0.96781602 1.00000000-0.12995706 0.100751509-0.096018623 0.26936047 0.164507065 0.3063020 0.167908072
WOHNFLAECHE-0.83604381 0.12574763-0.12995706 1.00000000-0.978940358 0.933850390-0.09023771-0.045955357-0.1269911-0.024752816
WOHNFLAECHE_SQUARED 0.76873533-0.09617486 0.10075151-0.97894036 1.000000000-0.984328676 0.04858789 0.008966544 0.1242373 0.007872795
WOHNFLAECHE_CUBIC-0.71073755 0.08888389-0.09601862 0.93385039-0.984328676 1.000000000-0.03057400-0.001389355-0.1135257-0.002507977
BAUJAHR 0.16217464-0.33365539 0.26936047-0.09023771 0.048587892-0.030574003 1.00000000 0.872085220 0.5342747 0.287987085
BAUJAHR_SQUARED 0.08515926-0.19061753 0.16450706-0.04595536 0.008966544-0.001389355 0.87208522 1.000000000 0.2954733 0.094674245
Zentralheizung1 0.08665900-0.40813070 0.30630202-0.12699108 0.124237252-0.113525681 0.53427467 0.295473310 1.0000000 0.437672310
Gas1-0.02566279-0.18768962 0.16790807-0.02475282 0.007872795-0.002507977 0.28798708 0.094674245 0.4376723 1.000000000
In [9]:
# Generate Residuals and Effect Plots

dfs_1<- lapply(1:5, function(i) complete(dfs, action = i))

for (i in 1:5){
  model<- lm(data=dfs_1[[i]],KALTMIETE_M2 ~ NEBENKOSTEN_M2
             + WOHNFLAECHE + WOHNFLAECHE_SQUARED + WOHNFLAECHE_CUBIC  + BAUJAHR + BAUJAHR_SQUARED + Zentralheizung 
             + Gas) 
  res <- resid(model)
  dfs_1[[i]]$fit <- fitted(model)
  #dfs_1[[i]]$res <- rstandard(model)
  dfs_1[[i]]$Effect <- model$coefficients["(Intercept)"] + model$coefficients["WOHNFLAECHE"]*dfs_1[[i]]$WOHNFLAECHE + model$coefficients["WOHNFLAECHE_SQUARED"]* (dfs_1[[i]]$WOHNFLAECHE_SQUARED) + model$coefficients["WOHNFLAECHE_CUBIC"]*(dfs_1[[i]]$WOHNFLAECHE_CUBIC)
  dfs_1[[i]]$Effect_2 <- model$coefficients["(Intercept)"] + model$coefficients["BAUJAHR"]*dfs_1[[i]]$BAUJAHR + model$coefficients["BAUJAHR_SQUARED"]* (dfs_1[[i]]$BAUJAHR_SQUARED)
    
  
 #produce residual vs. fitted plot
    
 print(ggplot(data.frame(fitted(model),res),aes(fitted(model),res)) +
  geom_point(alpha=0.6) + stat_smooth(method='loess')+ theme_bw()+
   xlab('Predicted Values') + ylab('Residuals') +
  theme(axis.title = element_text(face="bold"))) 
    
  #plot effect of the living area
    
  print(qplot(WOHNFLAECHE, Effect,data = dfs_1[[i]], 
              geom = c("point")) + theme_bw() 
        + ggtitle(substitute(paste(bold('Effect of the Living Area')))) +   theme(plot.title = element_text(hjust = 0.5))
        + labs(x= expression(bold("Living Area in") ~bold(m^2)), y=substitute(paste(bold('Effect')))
              ))
  
  #plot effect of the year of construction
  print(qplot(BAUJAHR+2000, Effect_2,data = dfs_1[[i]], 
              geom = c("point")) + theme_bw()
        + ggtitle(substitute(paste(bold('Effect of the Year of Construction')))) +   theme(plot.title = element_text(hjust = 0.5))
        + labs(x= substitute(bold(paste('Year of Construction'))), y=substitute(bold(paste('Effect')))))
  
  print(bptest(model))
}
`geom_smooth()` using formula 'y ~ x'

	studentized Breusch-Pagan test

data:  model
BP = 263.72, df = 8, p-value < 2.2e-16

`geom_smooth()` using formula 'y ~ x'

	studentized Breusch-Pagan test

data:  model
BP = 247.03, df = 8, p-value < 2.2e-16

`geom_smooth()` using formula 'y ~ x'

	studentized Breusch-Pagan test

data:  model
BP = 249.95, df = 8, p-value < 2.2e-16

`geom_smooth()` using formula 'y ~ x'

	studentized Breusch-Pagan test

data:  model
BP = 250.47, df = 8, p-value < 2.2e-16

`geom_smooth()` using formula 'y ~ x'

	studentized Breusch-Pagan test

data:  model
BP = 250.37, df = 8, p-value < 2.2e-16

In [10]:
# Print minimun prediction in each dataset

print(paste("1:",min(dfs_1[[1]]$fit)))
print(paste("2:",min(dfs_1[[2]]$fit)))
print(paste("3:",min(dfs_1[[3]]$fit)))
print(paste("4:",min(dfs_1[[4]]$fit)))
print(paste("5:",min(dfs_1[[5]]$fit)))
[1] "1: 3.04925831315137"
[1] "2: 2.85396290567098"
[1] "3: 3.82577662938494"
[1] "4: 3.04276043266461"
[1] "5: 3.83326986467252"

2. Using Dummy Variable for Year of Construction¶

In [11]:
# Generate models for each imputed dataset

ra0 <- with(dfs, expr = lm(KALTMIETE_M2 ~ NEBENKOSTEN_M2 + NEBENKOSTEN_SQUARED + HEIZUNGSKOSTEN_M2 + ZIMMER +
                             + WOHNFLAECHE + WOHNFLAECHE_SQUARED + WOHNFLAECHE_CUBIC  + Neubau + Zentralheizung 
                           + Gas + Fernwärme))


ra1 <- with(dfs, expr = lm(KALTMIETE_M2 ~ NEBENKOSTEN_M2 + NEBENKOSTEN_SQUARED + HEIZUNGSKOSTEN_M2
                           + WOHNFLAECHE + WOHNFLAECHE_SQUARED + WOHNFLAECHE_CUBIC   + Neubau + Zentralheizung 
                           + Gas))

ra <- with(dfs, expr = lm(KALTMIETE_M2 ~ NEBENKOSTEN_M2 + NEBENKOSTEN_SQUARED 
                          + WOHNFLAECHE +WOHNFLAECHE_SQUARED + WOHNFLAECHE_CUBIC  + Neubau + Zentralheizung 
                          + Gas))


# Build a list of models
models <- list( pool(ra0),pool(ra1),pool(ra))

# Generates model summary table

modelsummary(models, fmt = "%.4f",estimate = "{estimate}{stars}",  coef_rename = c("NEBENKOSTEN_M2" = "Utility Costs", 
                                                                                   "NEBENKOSTEN_SQUARED"="Utility Costs Squared",
                                                                                   "HEIZUNGSKOSTEN_M2" = "Heating Costs",
                                                                                   "ZIMMER" = "Rooms Number","WOHNFLAECHE" = "Living Area",
                                                                                   "WOHNFLAECHE_SQUARED" = "Living Area Squared",
                                                                                   "WOHNFLAECHE_CUBIC" = "Cubic Living Area",
                                                                                   "Neubau" = "New Construction",
                                                                                   "Zentralheizung1" = "Central Heating",
                                                                                   "Gas1" = "Gas", "Fernwärme1"= "District Heating"),output="jupyter")
Model 1 Model 2 Model 3
(Intercept) 18.0497*** 17.9116*** 17.9893***
(1.4580) (1.3880) (1.1918)
Utility Costs −1.1715* −1.1510* −1.1463*
(0.4534) (0.4514) (0.4619)
Utility Costs Squared 0.3584*** 0.3557*** 0.3559***
(0.0685) (0.0681) (0.0673)
Heating Costs 0.0420 0.0498
(0.2622) (0.2622)
Rooms Number −0.0646
(0.1798)
Living Area −0.2042*** −0.2081*** −0.2085***
(0.0381) (0.0363) (0.0355)
Living Area Squared 0.0024*** 0.0024*** 0.0024***
(0.0004) (0.0004) (0.0004)
Cubic Living Area −0.0000*** −0.0000*** −0.0000***
(0.0000) (0.0000) (0.0000)
New Construction 4.8587*** 4.8369*** 4.8347***
(0.2689) (0.2697) (0.2723)
Central Heating −1.5017* −1.4968* −1.4933*
(0.4493) (0.4506) (0.4473)
Gas 0.7029* 0.8204** 0.8223***
(0.3349) (0.2313) (0.2280)
District Heating −0.1639
(0.3081)
Num.Obs. 2950 2950 2950
Num.Imp. 5 5 5
R2 0.295 0.295 0.295
R2 Adj. 0.293 0.293 0.293
AIC 17607.2 17604.1 17604.3
BIC 17685.1 17670.0 17664.2
In [12]:
# Print the p-value of the overall models

print(paste("Model 1: ",glance_custom.mipo(models[[1]])$p.value))
print(paste("Model 2: ",glance_custom.mipo(models[[2]])$p.value))
print(paste("Model 3: ",glance_custom.mipo(models[[3]])$p.value))
[1] "Model 1:  4.15942745579809e-212"
[1] "Model 2:  4.00052600130179e-214"
[1] "Model 3:  4.63010050005655e-215"
In [13]:
# Pooling of the selected model

poolm <- pool(ra)

# -------- Deriving the full variance covariance matrix ------------------ 
# -------(Taken from a Thread of https://stackoverflow.com)--------------- 

# m different imputations

m <- poolm$m

# Vw, the within imputation covariance matrix

vw <- Reduce("+", lapply(ra$analyses, vcov)) / (m)

# Vb, the between imputation covariance matrix
# mice only provides the diagonal elements

bdiag <- poolm$pooled$b

# Calculate the full Vb matrix by hand

# Qbar, pooled parameter estimates

qbar <- getqbar(poolm)

# Qhats, each imputations parameter estimates

qhats <- sapply(ra$analyses, coef)
vb <- (1 / (m-1)) * (qhats - qbar) %*% t(qhats - qbar)

vt <- vw + (1 + 1 / (m)) * vb  # this is t as it used to be


# Checking against the diagonal of t that is still provided

all.equal(as.numeric(diag(vt)), poolm$pooled$t)


# Convert covariance matrix to correlation matrix 

corr <- solve(diag(sqrt(diag(vt)))) %*% vt %*% solve(diag(sqrt(diag(vt))))

varcorr <- cov2cor(vt)
TRUE
In [14]:
# Corelation matrix of betha
corr_2
A matrix: 10 × 10 of type dbl
(Intercept)NEBENKOSTEN_M2NEBENKOSTEN_SQUAREDWOHNFLAECHEWOHNFLAECHE_SQUAREDWOHNFLAECHE_CUBICBAUJAHRBAUJAHR_SQUAREDZentralheizung1Gas1
(Intercept) 1.00000000-0.54908070 0.53507033-0.83604381 0.768735331-0.710737552 0.16217464 0.085159263 0.0866590-0.025662787
NEBENKOSTEN_M2-0.54908070 1.00000000-0.96781602 0.12574763-0.096174855 0.088883890-0.33365539-0.190617526-0.4081307-0.187689625
NEBENKOSTEN_SQUARED 0.53507033-0.96781602 1.00000000-0.12995706 0.100751509-0.096018623 0.26936047 0.164507065 0.3063020 0.167908072
WOHNFLAECHE-0.83604381 0.12574763-0.12995706 1.00000000-0.978940358 0.933850390-0.09023771-0.045955357-0.1269911-0.024752816
WOHNFLAECHE_SQUARED 0.76873533-0.09617486 0.10075151-0.97894036 1.000000000-0.984328676 0.04858789 0.008966544 0.1242373 0.007872795
WOHNFLAECHE_CUBIC-0.71073755 0.08888389-0.09601862 0.93385039-0.984328676 1.000000000-0.03057400-0.001389355-0.1135257-0.002507977
BAUJAHR 0.16217464-0.33365539 0.26936047-0.09023771 0.048587892-0.030574003 1.00000000 0.872085220 0.5342747 0.287987085
BAUJAHR_SQUARED 0.08515926-0.19061753 0.16450706-0.04595536 0.008966544-0.001389355 0.87208522 1.000000000 0.2954733 0.094674245
Zentralheizung1 0.08665900-0.40813070 0.30630202-0.12699108 0.124237252-0.113525681 0.53427467 0.295473310 1.0000000 0.437672310
Gas1-0.02566279-0.18768962 0.16790807-0.02475282 0.007872795-0.002507977 0.28798708 0.094674245 0.4376723 1.000000000
In [15]:
# Generate Residuals and Effect Plots

dfs_1<- lapply(1:5, function(i) complete(dfs, action = i))

for (i in 1:5){
  model<- lm(data=dfs_1[[i]],KALTMIETE_M2 ~ NEBENKOSTEN_M2 + ZIMMER
             + WOHNFLAECHE + WOHNFLAECHE_SQUARED + WOHNFLAECHE_CUBIC + Neubau + Zentralheizung 
             + Gas + Fernwärme) 
  res <- resid(model)
  dfs_1[[i]]$fit <- fitted(model)
  #dfs_1[[i]]$res <- rstandard(model)
  dfs_1[[i]]$Effect <- model$coefficients["(Intercept)"] + model$coefficients["WOHNFLAECHE"]*dfs_1[[i]]$WOHNFLAECHE + model$coefficients["WOHNFLAECHE_SQUARED"]* (dfs_1[[i]]$WOHNFLAECHE_SQUARED) + model$coefficients["WOHNFLAECHE_CUBIC"]*(dfs_1[[i]]$WOHNFLAECHE_CUBIC)
  
  #produce residual vs. fitted plot
 print(ggplot(data.frame(fitted(model),res),aes(fitted(model),res)) +
  geom_point(alpha=0.6) + stat_smooth(method='loess')+ theme_bw()+
   xlab('Predicted Values') + ylab('Residuals') +
  theme(axis.title = element_text(face="bold"))) 
  
  #plot effect of the living area 
  print(qplot(WOHNFLAECHE, Effect,data = dfs_1[[i]], 
              geom = c("point")) + theme_bw() +
        ggtitle(substitute(paste(bold('Effect of the Living Area')))) +   theme(plot.title = element_text(hjust = 0.5))
        + labs(x= expression(bold("Living Area in") ~bold(m^2)), y=substitute(paste(bold('Effect')))
              ))
  
  print(bptest(model))
}
`geom_smooth()` using formula 'y ~ x'

	studentized Breusch-Pagan test

data:  model
BP = 183.84, df = 9, p-value < 2.2e-16

`geom_smooth()` using formula 'y ~ x'

	studentized Breusch-Pagan test

data:  model
BP = 157.99, df = 9, p-value < 2.2e-16

`geom_smooth()` using formula 'y ~ x'

	studentized Breusch-Pagan test

data:  model
BP = 184.86, df = 9, p-value < 2.2e-16

`geom_smooth()` using formula 'y ~ x'

	studentized Breusch-Pagan test

data:  model
BP = 175.26, df = 9, p-value < 2.2e-16

`geom_smooth()` using formula 'y ~ x'

	studentized Breusch-Pagan test

data:  model
BP = 185.98, df = 9, p-value < 2.2e-16

In [16]:
# Print minimun prediction in each dataset

print(paste("1:",min(dfs_1[[1]]$fit)))
print(paste("2:",min(dfs_1[[2]]$fit)))
print(paste("3:",min(dfs_1[[3]]$fit)))
print(paste("4:",min(dfs_1[[4]]$fit)))
print(paste("5:",min(dfs_1[[5]]$fit)))
[1] "1: 3.11682478490676"
[1] "2: 2.55910213342987"
[1] "3: 3.84904019682308"
[1] "4: 1.73628279993135"
[1] "5: 2.87823769049346"

Compare the Effect of the Year of Construction between the two Schemes¶

In [17]:
# Generate Residuals and Effect Plots

dfs_1<- lapply(1:5, function(i) complete(dfs, action = i))

for (i in 1:5){
    
  model1<- lm(data=dfs_1[[i]],KALTMIETE_M2 ~ NEBENKOSTEN_M2
             + WOHNFLAECHE + WOHNFLAECHE_SQUARED + WOHNFLAECHE_CUBIC + BAUJAHR + BAUJAHR_SQUARED + Zentralheizung 
             + Gas)
  model2<- lm(data=dfs_1[[i]],KALTMIETE_M2 ~ NEBENKOSTEN_M2 + ZIMMER
             + WOHNFLAECHE + WOHNFLAECHE_SQUARED + WOHNFLAECHE_CUBIC + Neubau + Zentralheizung 
             + Gas + Fernwärme)
    
  dfs_1[[i]]$Effect1 <- model1$coefficients["(Intercept)"] + model1$coefficients["BAUJAHR"]*dfs_1[[i]]$BAUJAHR + model1$coefficients["BAUJAHR_SQUARED"]*(dfs_1[[i]]$BAUJAHR_SQUARED)
  dfs_1[[i]]$Effect2 <- ifelse(dfs_1[[i]]$Neubau==1,model2$coefficients["(Intercept)"] + model2$coefficients["Neubau"]*dfs_1[[i]]$Neubau,NA)
  dfs_1[[i]]$Effect3 <- ifelse(dfs_1[[i]]$Neubau==0 & dfs_1[[i]]$BAUJAHR>=(-100),model2$coefficients["(Intercept)"] + model2$coefficients["Neubau"]*dfs_1[[i]]$Neubau,NA)
  dfs_1[[i]]$Effect4 <- model1$coefficients["(Intercept)"] + model1$coefficients["WOHNFLAECHE"]*dfs_1[[i]]$WOHNFLAECHE + model1$coefficients["WOHNFLAECHE_SQUARED"]* (dfs_1[[i]]$WOHNFLAECHE_SQUARED) + model1$coefficients["WOHNFLAECHE_CUBIC"]*(dfs_1[[i]]$WOHNFLAECHE_CUBIC)
  dfs_1[[i]]$Effect5 <- model2$coefficients["(Intercept)"] + model2$coefficients["WOHNFLAECHE"]*dfs_1[[i]]$WOHNFLAECHE + model2$coefficients["WOHNFLAECHE_SQUARED"]* (dfs_1[[i]]$WOHNFLAECHE_SQUARED) + model2$coefficients["WOHNFLAECHE_CUBIC"]*(dfs_1[[i]]$WOHNFLAECHE_CUBIC)
    
 #plot effect of the year of construction
  print(ggplot(data = dfs_1[[i]],aes(x=BAUJAHR+2000)) + theme_bw() 
        + geom_line(aes(y = Effect2), color = "darkred") 
        + geom_line(aes(y = Effect3), color = "darkred") 
        + geom_line(aes(y = Effect1), color="steelblue", linetype="twodash") 
        + geom_vline(aes(xintercept=2000), linetype=2, color="grey")
         + ggtitle(substitute(paste(bold('Effect of the Year of Construction')))) +   theme(plot.title = element_text(hjust = 0.5))
        + labs(x= substitute(bold(paste('Year of Construction'))), y=substitute(bold(paste('Effect')))))


               
 #plot effect of the living area
  print(ggplot(data = dfs_1[[i]],aes(x=WOHNFLAECHE)) + theme_bw() 
        + geom_line(aes(y = Effect4), color = "darkred")  
        + geom_line(aes(y = Effect5), color="steelblue", linetype="twodash") 
        + ggtitle(substitute(paste(bold('Effect of the Living Area')))) +   theme(plot.title = element_text(hjust = 0.5))
        + labs(x= expression(bold("Living Area in") ~bold(m^2)), y=substitute(bold(paste('Effect')))))

}
Warning message:
“Removed 1866 row(s) containing missing values (geom_path).”
Warning message:
“Removed 1159 row(s) containing missing values (geom_path).”
Warning message:
“Removed 1868 row(s) containing missing values (geom_path).”
Warning message:
“Removed 1152 row(s) containing missing values (geom_path).”
Warning message:
“Removed 1859 row(s) containing missing values (geom_path).”
Warning message:
“Removed 1162 row(s) containing missing values (geom_path).”
Warning message:
“Removed 1885 row(s) containing missing values (geom_path).”
Warning message:
“Removed 1145 row(s) containing missing values (geom_path).”
Warning message:
“Removed 1883 row(s) containing missing values (geom_path).”
Warning message:
“Removed 1142 row(s) containing missing values (geom_path).”